Figure 3

# set up the environment

library(Seurat)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
library(CelltypeR)
Warning: replacing previous import ‘data.table::last’ by ‘dplyr::last’ when loading ‘CelltypeR’
Warning: replacing previous import ‘data.table::first’ by ‘dplyr::first’ when loading ‘CelltypeR’
Warning: replacing previous import ‘data.table::between’ by ‘dplyr::between’ when loading ‘CelltypeR’
Warning: replacing previous import ‘dplyr::filter’ by ‘flowCore::filter’ when loading ‘CelltypeR’
Warning: replacing previous import ‘ggplot2::margin’ by ‘randomForest::margin’ when loading ‘CelltypeR’
Warning: replacing previous import ‘dplyr::combine’ by ‘randomForest::combine’ when loading ‘CelltypeR’
Warning: replacing previous import ‘flowCore::filter’ by ‘dplyr::filter’ when loading ‘CelltypeR’
Warning: replacing previous import ‘flowViz::contour’ by ‘graphics::contour’ when loading ‘flowStats’

Attaching package: ‘CelltypeR’

The following object is masked from ‘package:ggplot2’:

    annotate

Figure 3A - predicted correlation matrix

hm <- heatmap(mat, 
        Rowv = NA,  # Don't cluster rows
        Colv = NA,  # Don't cluster columns
        col = colorRampPalette(c("white", "blue"))(100),  # Define a color palette
        main = "Reference Matrix")

hm
$rowInd
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13

$colInd
[1] 1 2 3 4 5 6 7 8 9

$Rowv
NULL

$Colv
NULL

Plot with ggplot Figure 3A


hm

pdf("/Users/rhalenathomas/Documents/Projects_Papers/PhenoID/ForFigures/June2023/ReferenceMatrix.pdf", width = 6.5, height = 5)
hm
dev.off()
quartz_off_screen 
                2 

Calculate correlations

Plot correlations Figure 3B, C and S3

p.cor2 <- plot_corr(cor2, threshold = 0.1, min_cells = 500)
Using X, best.cell.type, second.cell.type, cell.label, cell.label.ft, new_cell.label as id variables
Using X, best.cell.type, second.cell.type, cell.label, cell.label.ft, new_cell.label as id variables
Using X, best.cell.type, second.cell.type, cell.label, cell.label.ft, new_cell.label as id variables
pdf(paste(fig_outs,"CorPlots_thresh01.pdf",sep = ""))
p.cor2
[[1]]

[[2]]

[[3]]

[[4]]
Warning: Removed 1 rows containing missing values (`geom_violin()`).

[[5]]

[[6]]

[[7]]

[[8]]
dev.off()
null device 
          1 

Adjust pair of cell types plots to plot mixes with higher cell counts Figures S4,S5,S6

pdf(paste(fig_outs,"CAMcorpairs_thresh0553.pdf",sep=""))
p1
dev.off()
null device 
          1 
# Define the threshold number of pairs
df <- cor3 # threshold 0.35
threshold <- 150

# Filter the double-labeled cells
double.cells <- df[grep("-", df$cell.label),]
label_counts <- table(double.cells$cell.label)
filters_labels <- names(label_counts[label_counts >= threshold])
# Filter the double-labeled cells based on the filtered labels
filtered_double.cells <- double.cells[double.cells$cell.label %in% filters_labels, ]

# Melt the filtered double-labeled cells for plotting
df.melt.filtered <- melt(filtered_double.cells)
Using X, best.cell.type, second.cell.type, cell.label, cell.label.ft as id variables
# Plot the filtered double-labeled cells
p2 <- ggplot(df.melt.filtered, aes(x = variable, y = value, colour = variable, group = X))+
      geom_line(show.legend = FALSE, size = 0.1, color = "black") +
      geom_point() +
      scale_color_manual(values = c("#4E84C4", "#52854C","purple","orange")) +
      ylim(0.1, 0.8) +
      facet_wrap(~ as.factor(cell.label)) +
      ylab("Correlation Coefficient") +
      xlab("")

p2

pdf(paste(fig_outs,"CAMcorpairs_thresh35.pdf",sep=""))
p2
dev.off()
quartz_off_screen 
                2 

Make plots for Figure 3B and C


plot1b  
plot2
Warning: Removed 3 rows containing missing values (`geom_violin()`).

Figure 3 clustering

This is with the subsample of 9000 cells per hMO Figure 3D and E

Add different correlation assignments and visualize on UMAP from the low threshold for CAM

Visualize CAM with the signifance 0.553 threshold

seu <- AddMetaData(object=seu, metadata= cor1$cell.label, col.name = 'cor.labels05')

# see the labels added
unique(seu$cor.labels05)
 [1] "Unassigned"              "astrocytes"             
 [3] "neurons"                 "RG"                     
 [5] "OPC"                     "Epithelial"             
 [7] "NPC"                     "neurons-NPC"            
 [9] "oligodendrocyte"         "Endothelial"            
[11] "neurons-oligodendrocyte" "OPC-neurons"            
[13] "NPC-stemlike"            "oligodendrocyte-neurons"
[15] "neurons-OPC"             "RG-astrocytes"          
[17] "astrocytes-RG"           "NPC-neurons"            
[19] "oligodendrocyte-NPC"     "stemlike"               
[21] "NPC-oligodendrocyte"     "stemlike-NPC"           
[23] "Epithelial-NPC"          "RG-NPC"                 
[25] "OPC-oligodendrocyte"     "Endothelial-Epithelial" 
[27] "Epithelial-Endothelial"  "Epithelial-stemlike"    
[29] "RG-Epithelial"           "oligodendrocyte-OPC"    
[31] "astrocytes-stemlike"     "RG-oligodendrocyte"     
[33] "neurons-stemlike"        "Epithelial-RG"          
DimPlot(seu, group.by = 'cor.labels05', label = TRUE) + theme(legend.position = "none")

Visualize CAM assignments with threshold R = 0.35


seu <- AddMetaData(object=seu, metadata= cor3$cell.label, col.name = 'cor.labels035')

# see the labels added
#unique(seu$cor.labels01)

# plot the cluster predictions
#plot_lab_clust(seu, seu$RNA_snn_res.0.7, seu$cor.labels01)

DimPlot(seu, group.by = 'cor.labels035', label = TRUE) + theme(legend.position = "none")


# removing the combined cell labels that are low frequency will improve visualization
# Check if the cell type has a frequency less than 500 and assign new label accordingly

# Check if the cell type has a frequency less than 500 and assign new label accordingly
cor3$cell.label.ft <- ifelse(cor3$cell.label %in% freq.cor3.df$cell.label[freq.cor3.df$Freq < 200], "few", cor3$cell.label)


seu <- AddMetaData(object=seu, metadata= cor3$cell.label.ft, col.name = 'cor.labels035ft')
unique(seu$cor.labels035ft)
 [1] "astrocytes"              "RG"                     
 [3] "OPC"                     "neurons"                
 [5] "RG-Epithelial"           "neurons-oligodendrocyte"
 [7] "Endothelial"             "Epithelial-Endothelial" 
 [9] "oligodendrocyte-OPC"     "unassigned"             
[11] "Epithelial"              "NPC"                    
[13] "oligodendrocyte"         "oligodendrocyte-neurons"
[15] "few"                     "Epithelial-RG"          
[17] "neurons-OPC"             "stemlike"               
[19] "OPC-oligodendrocyte"     "RG-astrocytes"          
[21] "astrocytes-RG"           "Endothelial-Epithelial" 
[23] "OPC-neurons"            
DimPlot(seu, group.by = 'cor.labels035ft', label = TRUE, 
        label.size = 8, repel = TRUE) + theme(legend.position = "none")

NA
NA

Get the top assigned cells per cluster for each threshold

cor.ann.035 <- get_annotation(seu, seu.cluster = seu$RNA_snn_res.0.7, 
                          seu.label = seu$cor.labels035, top_n = 3, 
                          filter_out = c("Unknown","unknown","Mixed", 
                                         "unassigned","Unassigned"), 
                          Label = "CAM")
[1] "filtering"
cor.ann.01 <- get_annotation(seu, seu.cluster = seu$RNA_snn_res.0.7, 
                          seu.label = seu$cor.labels01, top_n = 3, 
                          filter_out = c("Unknown","unknown","Mixed","
                                         unassigned","Unassigned"), 
                          Label = "CAM")
[1] "filtering"
cor.ann.05 <- get_annotation(seu, seu.cluster = seu$RNA_snn_res.0.7, 
                          seu.label = seu$cor.labels05, top_n = 3, 
                          filter_out = c("Unknown","unknown","Mixed",
                                         "unassigned","Unassigned"), 
                          Label = "CAM")
[1] "filtering"

Visualize Marker expression

length(unique(seu$RNA_snn_res.0.7))
[1] 19
# 19
# if we want to plot by cluster we need a vector of from 0 to the n-1 clusters
cluster.num <- c(0:18)

plotmean(plot_type = 'heatmap',seu = seu, group = 'RNA_snn_res.0.7', markers = AB, 
                     var_names = cluster.num, slot = 'scale.data', xlab = "Cluster",
         ylab = "Antibody")

NA
NA

Feature plots

Idents(seu) <- "RNA_snn_res.0.7"

for (i in AB) {
  print(FeaturePlot(seu, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE))
}

NA
NA
pdf(paste(fig_outs,"HM9000_fig3E.pdf"),width = 8.5, height = 5)
DoHeatmap(seu, features = AB, size= 6,slot = "scale.data", group.colors = clust.colours, disp.max = 2, disp.min = -1.5,
          angle = 90) + scale_fill_gradientn(colors = c("#154c79", "#eeeee4", "#e28743")) + 
  theme(axis.text.y = element_text(size = 15))
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
dev.off()
null device 
          1 
seu <-readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/NatMethodJuneSubmission/Seu9000lablesJune23.RDS") 
Warning message:
R graphics engine version 15 is not supported by this version of RStudio. The Plots tab will be disabled until a newer version of RStudio is installed. 

Train a Random Forest classifier


markers <- rev(c("CD24","CD56","CD29","CD15","CD184","CD133","CD71","CD44","GLAST","AQP4","HepaCAM", "CD140a","O4"))
rf <- RFM_train(seurate_object = seu, 
                             AB_list = markers, annotations = seu$labels,
                      split = c(0.5,0.5),
                      downsample = "none",
                      seed = 222,
                      mytry = c(1:10),
                      maxnodes = c(10: 20),
                      trees = c(250, 500, 1000,2000),
                      start_node = 15)
---
title: "R Notebook"
output: html_notebook
---

# Figure 3

```{r}
# set up the environment

library(Seurat)
library(dplyr)
library(ggplot2)
library(reshape2)
library(CelltypeR)

```

Figure 3A - predicted correlation matrix


```{r}
# Figure 3A 
# Plot the reference matrix for correlation
reference_path <- "/Users/rhalenathomas/GITHUB/CelltypeR/ExampleOuts/FinalReferenceMatrix.csv"
reference_data <- read.csv(reference_path)
# the reference matrix need to be in the format cell types as rows and markers as columns
# there is a column X with the markers
#colnames(reference_data)
# we need a row X with the cell type names
# Transpose the dataframe and convert values to numeric

reference_data <- rbind(reference_data, colnames(reference_data))
rownames(reference_data) <- reference_data$X
reference_data <- reference_data %>% select(-"X")
df_ref <- as.data.frame(t(reference_data))
# make it numeric
df_ref2 <- apply(df_ref, 2, as.numeric)
df_ref2 <- as.data.frame(df_ref2)
# add back the cell type column
df_ref2$X <- df_ref$X
colnames(df_ref2)
rownames(df_ref2) <- df_ref2$X
head(df_ref2)
# Plot the reference matrix
rownames(reference_data) <- reference_data$X
df_ref1 <- reference_data %>% select(-"X")
mat <- as.matrix(df_ref1)

hm <- heatmap(mat, 
        Rowv = NA,  # Don't cluster rows
        Colv = NA,  # Don't cluster columns
        col = colorRampPalette(c("white", "blue"))(100),  # Define a color palette
        main = "Reference Matrix")

hm

```

Plot with ggplot Figure 3A

```{r}

colnames(mat) <- c("Astrocyte","Endothelial","Epithelial","Neuron",
                   "NPC","OPC","Oligo","RG","Stem")

marker.order <- rev(c("CD24","CD56","CD29","CD15","CD184","CD133","CD71","CD44","GLAST","AQP4","HepaCAM", "CD140a","O4"))
# reformat df
long_df <- melt(mat, varnames = c("Marker", "Cell_Type"), value.name = "Expression")
long_df$Marker <- factor(long_df$Marker, levels = marker.order)

hm <- ggplot(long_df, aes(x = Cell_Type, y = Marker, fill = Expression)) +
  geom_tile(color = "white") +  # Remove the black border by setting color to "white"
  scale_fill_gradient(low = "seashell2", high = "red3") +
  labs(x = "Cell Type", y = "Marker", fill = "Relative Expression") + 
  theme_bw() +
  theme(panel.border = element_blank(),  # Remove the panel border
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 12, colour = "black"), 
        axis.text.y = element_text(hjust = 1 ,vjust = 0.5, size = 12, colour = "black"),
        axis.title = element_text(colour = "black", size = 14),
        panel.grid = element_blank(),
        axis.ticks = element_blank())  # Remove the panel grid

hm

pdf("/Users/rhalenathomas/Documents/Projects_Papers/PhenoID/ForFigures/June2023/ReferenceMatrix.pdf", width = 6.5, height = 5)
hm
dev.off()


```



Calculate correlations

```{r}

test_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/prepro_outsretrotransformed_flowset.csv"

test.df <- read.csv(test_path)
dim(test.df)
head(test.df)

# with 13 antibodies R value of 0.553 has a significant p value less than 0.05
cor1 <- find_correlation(test = test.df, reference = df_ref2,
                        min_corr = 0.553, min_diff = 0.05)

write.csv(cor1, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/correlations/cor9000cellsRthresh0553_june2023B.csv")

cor1 <- read.csv("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/correlations/cor9000cellsRthresh0553_june2023B.csv")


# see how the cells mostly would be assigned if not for the threshold
cor2 <- find_correlation(test = test.df, reference = df_ref2,
                        min_corr = 0.1, min_diff = 0.05)

write.csv(cor2, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/correlations/cor9000cellsRthresh01_june2023.csv")

head(cor2)

cor3 <- find_correlation(test = test.df, reference = df_ref2,
                        min_corr = 0.35, min_diff = 0.05)

write.csv(cor3, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/correlations/cor9000cellsRthresh035_june2023.csv")


```

Plot correlations
Figure 3B, C and S3

```{r}

# plot the main groups - and the correlation co-efficient for the assigned group
# plots are created by the plot_corr function, which uses ggplot2
plot_corr(cor1, threshold = 0.553, min_cells = 70) 
plot_corr(cor2, threshold = 0.1, min_cells = 500)
plot_corr(cor3, threshold = 0.35, min_cells = 500)

# save the plots
fig_outs <- "/Users/rhalenathomas/Documents/Projects_Papers/PhenoID/ForFigures/June2023/"

p.cor1 <- plot_corr(cor1, threshold = 0.553, min_cells = 70)

pdf(paste(fig_outs,"CorPlots_thresh0553.pdf",sep = ""))
p.cor1
dev.off()

p.cor2 <- plot_corr(cor2, threshold = 0.1, min_cells = 500)



pdf(paste(fig_outs,"CorPlots_thresh01.pdf",sep = ""))
p.cor2
dev.off()
#How many cells are there in each category - this is in the frequency table
freq.cor2.df <- as.data.frame(p.cor2[[1]])

p.cor3 <- plot_corr(cor3, threshold = 0.35, min_cells = 500)

pdf(paste(fig_outs,"CorPlots_thresh035.pdf",sep = ""))
p.cor3
dev.off()
#How many cells are there in each category - this is in the frequency table
freq.cor3.df <- as.data.frame(p.cor3[[1]])



```
Adjust pair of cell types plots to plot mixes with higher cell counts
Figures S4,S5,S6 

```{r}
# Define the threshold number of pairs
df <- cor1 # threshold 0.553
threshold <- 8

# Filter the double-labeled cells
double.cells <- df[grep("-", df$cell.label),]
label_counts <- table(double.cells$cell.label)
filters_labels <- names(label_counts[label_counts >= threshold])
# Filter the double-labeled cells based on the filtered labels
filtered_double.cells <- double.cells[double.cells$cell.label %in% filters_labels, ]

# Melt the filtered double-labeled cells for plotting
df.melt.filtered <- melt(filtered_double.cells)

# Plot the filtered double-labeled cells
p1 <- ggplot(df.melt.filtered, aes(x = variable, y = value, colour = variable, group = X))+
      geom_line(show.legend = FALSE, size = 0.1, color = "black") +
      geom_point() +
      scale_color_manual(values = c("#4E84C4", "#52854C","purple","orange")) +
      ylim(0, 1) +
      facet_wrap(~ as.factor(cell.label)) +
      ylab("Correlation Coefficient") +
      xlab("")

p1

pdf(paste(fig_outs,"CAMcorpairs_thresh0553.pdf",sep=""))
p1
dev.off()

```


```{r}
# Define the threshold number of pairs
df <- cor3 # threshold 0.35
threshold <- 150

# Filter the double-labeled cells
double.cells <- df[grep("-", df$cell.label),]
label_counts <- table(double.cells$cell.label)
filters_labels <- names(label_counts[label_counts >= threshold])
# Filter the double-labeled cells based on the filtered labels
filtered_double.cells <- double.cells[double.cells$cell.label %in% filters_labels, ]

# Melt the filtered double-labeled cells for plotting
df.melt.filtered <- melt(filtered_double.cells)

# Plot the filtered double-labeled cells
p2 <- ggplot(df.melt.filtered, aes(x = variable, y = value, colour = variable, group = X))+
      geom_line(show.legend = FALSE, size = 0.1, color = "black") +
      geom_point() +
      scale_color_manual(values = c("#4E84C4", "#52854C","purple","orange")) +
      ylim(0.1, 0.8) +
      facet_wrap(~ as.factor(cell.label)) +
      ylab("Correlation Coefficient") +
      xlab("")

p2

pdf(paste(fig_outs,"CAMcorpairs_thresh35.pdf",sep=""))
p2
dev.off()

```



Make plots for Figure 3B and C

```{r}

# plotting script from inside the function above
threshold = 0.553
min_cells = 55

# violin plot
df.filter <- cor1 %>% group_by(cell.label) %>% dplyr::filter(n()> min_cells)

plot2 <- ggplot(df.filter, aes(x = best.cell.type, y = cor.1, fill = best.cell.type)) +
    geom_violin(trim = FALSE) +
    ylim(0, 1) +
    theme_classic() +
    theme(text = element_text(size = 14), axis.text.x = element_text(angle = 90, size = 12)) +
    ylab("correlation coefficient") +
    xlab("Cell type with max correlation coefficient") +
    geom_hline(yintercept = threshold) +
    guides(fill = guide_legend(title = "Cell Type"))


# save
pdf(paste(fig_outs,"VlnCAMthresh533.pdf",sep=""),width = 5,height = 4)
plot2
dev.off()
# bar chart with the R > 0.553 threshold

df.filter2 <- cor1 %>%
    filter(!cell.label %in% c("Unassigned", "unassigned")) %>%
    group_by(cell.label) %>%
    filter(n() > min_cells)
  

  plot1b <-
    ggplot(df.filter2, aes(x = reorder(
      cell.label, cell.label, function(x) - length(x)),
      fill = cell.label)) + geom_bar() + theme_classic() +
    theme(axis.text.x = element_text(size = 12, colour = "black", angle = 90, hjust=0.99,vjust=0.5),
          axis.text.y = element_text(size = 12, colour = "black"),
          axis.title.x = element_text(size = 14, colour = "black"),
          axis.title.y = element_text(size = 14, colour = "black"),
          plot.margin = margin(15, 1, 1, 1)) +

    scale_y_continuous(expand = c(0, 0)) + # take the space away below the bars
    xlab('Assigned cell type') +
    ylab('number of cell') +
    labs(fill='Cell Types')
  

pdf(paste(fig_outs,"BarchartCAMthresh533.pdf",sep=""),width = 5,height = 4)
plot1b
dev.off()

plot1b  
plot2

```






Figure 3 clustering

This is with the subsample of 9000 cells per hMO
Figure 3D and E

```{r}
# read in the seurat objects
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/9MBO/prepro_outsjan20-9000cells/Figure3/cluster_parameters/retro-louv-moreparm/retrotransLouvainSeuratObject60.Rds")
AB <- c("CD24","CD56","CD71","CD29","CD15", "CD184","CD133", "GLAST","CD44","AQP4","HepaCAM","CD140a", "O4" )


# the highest Rand Index are res = 0.1, 7 clusters, res = 0.15, 9 clusters.  Both very low RI std.
# the high RI with low std is 0.3 and 0.7, cluster numbers also have low std 
# from bootstrap 100X
# annotation is easier with 18-25 clusters 

# seu <- RunUMAP(seu,spread = 1, min.dist = 0.05, dims = 1:12)

DimPlot(seu, reduction = "umap", label = TRUE, group.by = 'RNA_snn_res.0.7', repel = TRUE)



```

Add different correlation assignments and visualize on UMAP from the low threshold for CAM

```{r, fig.width=5}

seu <- AddMetaData(object=seu, metadata= cor2$cell.label, col.name = 'cor.labels01')

# see the labels added
#unique(seu$cor.labels01)

# plot the cluster predictions
#plot_lab_clust(seu, seu$RNA_snn_res.0.7, seu$cor.labels01)

DimPlot(seu, group.by = 'cor.labels01', label = TRUE) + theme(legend.position = "none")

# removing the combined cell labels that are low frequency will improve visualization
# Check if the cell type has a frequency less than 500 and assign new label accordingly

# Check if the cell type has a frequency less than 500 and assign new label accordingly
cor2$cell.label.ft <- ifelse(cor2$cell.label %in% freq.cor2.df$cell.label[freq.cor2.df$Freq < 500], "few", cor2$cell.label)


seu <- AddMetaData(object=seu, metadata= cor2$cell.label.ft, col.name = 'cor.labels01ft')
unique(seu$cor.labels01ft)
plot_lab_clust(seu, seu$RNA_snn_res.0.7, seu$cor.labels01ft)

DimPlot(seu, group.by = 'cor.labels01ft', label = TRUE, 
        label.size = 8, repel = TRUE) + theme(legend.position = "none")


```
Visualize CAM with the signifance 0.553 threshold

```{r, fig.width=5}

seu <- AddMetaData(object=seu, metadata= cor1$cell.label, col.name = 'cor.labels05')

# see the labels added
unique(seu$cor.labels05)


DimPlot(seu, group.by = 'cor.labels05', label = TRUE) + theme(legend.position = "none")


```

Visualize CAM assignments with threshold R = 0.35

```{r, fig.width=5}

seu <- AddMetaData(object=seu, metadata= cor3$cell.label, col.name = 'cor.labels035')

# see the labels added
#unique(seu$cor.labels01)

# plot the cluster predictions
#plot_lab_clust(seu, seu$RNA_snn_res.0.7, seu$cor.labels01)

DimPlot(seu, group.by = 'cor.labels035', label = TRUE) + theme(legend.position = "none")

# removing the combined cell labels that are low frequency will improve visualization
# Check if the cell type has a frequency less than 500 and assign new label accordingly

# Check if the cell type has a frequency less than 500 and assign new label accordingly
cor3$cell.label.ft <- ifelse(cor3$cell.label %in% freq.cor3.df$cell.label[freq.cor3.df$Freq < 200], "few", cor3$cell.label)


seu <- AddMetaData(object=seu, metadata= cor3$cell.label.ft, col.name = 'cor.labels035ft')
unique(seu$cor.labels035ft)

DimPlot(seu, group.by = 'cor.labels035ft', label = TRUE, 
        label.size = 8, repel = TRUE) + theme(legend.position = "none")


```

Get the top assigned cells per cluster for each threshold

```{r}
cor.ann.035 <- get_annotation(seu, seu.cluster = seu$RNA_snn_res.0.7, 
                          seu.label = seu$cor.labels035, top_n = 3, 
                          filter_out = c("Unknown","unknown","Mixed", 
                                         "unassigned","Unassigned"), 
                          Label = "CAM")
cor.ann.01 <- get_annotation(seu, seu.cluster = seu$RNA_snn_res.0.7, 
                          seu.label = seu$cor.labels01, top_n = 3, 
                          filter_out = c("Unknown","unknown","Mixed","
                                         unassigned","Unassigned"), 
                          Label = "CAM")
cor.ann.05 <- get_annotation(seu, seu.cluster = seu$RNA_snn_res.0.7, 
                          seu.label = seu$cor.labels05, top_n = 3, 
                          filter_out = c("Unknown","unknown","Mixed",
                                         "unassigned","Unassigned"), 
                          Label = "CAM")


```

Visualize Marker expression

```{r}
length(unique(seu$RNA_snn_res.0.7))
# 19
# if we want to plot by cluster we need a vector of from 0 to the n-1 clusters
cluster.num <- c(0:18)

plotmean(plot_type = 'heatmap',seu = seu, group = 'RNA_snn_res.0.7', markers = AB, 
                     var_names = cluster.num, slot = 'scale.data', xlab = "Cluster",
         ylab = "Antibody")


```
Feature plots

```{r}
Idents(seu) <- "RNA_snn_res.0.7"

for (i in AB) {
  print(FeaturePlot(seu, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE))
}


```




```{r}


# annotate cells 
Idents(seu) <- "RNA_snn_res.0.7"
cluster.ids <- c("Unassigned","Glial-lineage","Neurons 1","Radial Glia 1",
            "Radial Glia 3","Epithelial","Neurons 2",
            "Astrocytes 1","Astrocytes 1",
                 "Astrocytes 2","Neurons 2","NPC","Radial Glia 1",
            "Radial Glia 2",
                 "Endothelial","Oligodendrocytes","OPC","Stem cell like",
                 "NPC")

names(cluster.ids) <- levels(seu)
seu <- RenameIdents(seu, cluster.ids)
seu$labels <- Idents(seu)

DimPlot(seu, label = TRUE)
# with more subgroups
Idents(seu) <- "labels"
levels(seu)

# there are 16 levels in the cell type annotations with major groups

# change the order of the cell types on the legend of the umap
cell.type.order <- c("Astrocytes 1", "Astrocytes 2","Radial Glia 1","Radial Glia 2",
                     "Radial Glia 3","Glial-lineage",
                     "Epithelial","Endothelial",
                     "NPC","Neurons 1","Neurons 2",
                     "Oligodendrocytes","OPC","Stem cell like","Unassigned")
cell.type.order <- rev(cell.type.order)

# colour order to match cell type order
clust.colours <- c("chocolate2","darkorange","pink","coral1","lightpink3",
                   "mistyrose2",
                   "steelblue3","deepskyblue",
                   "mediumpurple1","purple","plum3",
                   "seagreen3","olivedrab4","tomato3","burlywood3")
                   
                   
 #                  "plum1","purple","magenta3","mediumpurple1","darkorchid","plum3",
#             "steelblue3","darkorange1","orange1","lightcoral","coral1","orangered1","lightsalmon",
 #            "cyan")

# Figure 3D UMAP with annotated clusters
# use PDF for figure for correct resolution
pdf(paste(fig_outs,"UMAPlabelled9000.noraster.pdf"),width = 9, height = 5)
 DimPlot(seu, order = cell.type.order, cols = clust.colours, shuffle = TRUE, raster=FALSE,pt.size = 0.1, label = FALSE) +
   theme(legend.text = element_text(size=16), axis.title.y = element_text(size=16), 
         axis.title.x = element_text(size=16), axis.text.y = element_text(size =16),
        axis.text.x = element_text(size =16))
 dev.off()

DimPlot(seu, order = cell.type.order, cols = clust.colours, shuffle = TRUE, raster=FALSE,pt.size = 0.1, label = TRUE)

######## figure 3E heatmap of the 

# reorder the bars to match the UMAP
levels(seu) <- c("Astrocytes 1", "Astrocytes 2","Radial Glia 1","Radial Glia 2",
                     "Radial Glia 3","Glial-lineage",
                     "Epithelial","Endothelial",
                     "NPC","Neurons 1","Neurons 2",
                     "Oligodendrocytes","OPC","Stem cell like","Unassigned")

pdf(paste(fig_outs,"HM9000_fig3E.pdf"),width = 8.5, height = 5)
DoHeatmap(seu, features = AB, size= 6,slot = "scale.data", group.colors = clust.colours, disp.max = 2, disp.min = -1.5,
          angle = 90) + scale_fill_gradientn(colors = c("#154c79", "#eeeee4", "#e28743")) + 
  theme(axis.text.y = element_text(size = 15))
dev.off()

```

```{r}
saveRDS(seu,"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/NatMethodJuneSubmission/Seu9000lablesJune23.RDS")

seu <-readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/NatMethodJuneSubmission/Seu9000lablesJune23.RDS") 

```

Train a Random Forest classifier

```{r}

# get the names of hte meta data to know the annotations to call
colnames(seu@meta.data)

DimPlot(seu, group.by = "labels")

```
```{r}

markers <- rev(c("CD24","CD56","CD29","CD15","CD184","CD133","CD71","CD44","GLAST","AQP4","HepaCAM", "CD140a","O4"))
rf <- RFM_train(seurate_object = seu, 
                             AB_list = markers, annotations = seu$labels,
                      split = c(0.5,0.5),
                      downsample = "none",
                      seed = 222,
                      mytry = c(1:10),
                      maxnodes = c(10: 20),
                      trees = c(250, 500, 1000,2000),
                      start_node = 15)
```


